https://www.tidytextmining.com/
library(rtweet)
library(tidyverse)
library(tidytext)
library(wordcloud2)
search_tweets
# Query Twitter for "marchmadness", asking for 1000 English-language tweets.
tweet_collection <- search_tweets(
  "marchmadness",
  n = 1000,
  lang = "en"
)
Downloading [=======>---------------------------------] 20%
Downloading [===========>-----------------------------] 30%
Downloading [===============>-------------------------] 40%
Downloading [===================>---------------------] 50%
Downloading [========================>----------------] 60%
Downloading [============================>------------] 70%
Downloading [================================>--------] 80%
Downloading [====================================>----] 90%
Downloading [=========================================] 100%
# Keep original tweets only. `is_retweet` is a logical column, so negate it
# directly rather than comparing it with the string "FALSE", which only works
# through implicit logical-to-character coercion.
tweet_collection <- tweet_collection %>%
  filter(!is_retweet)
tweet_collection
# Number each account's tweets 1, 2, 3, ... in the order they appear, storing
# the running index in `line`.
tweets_by_tweeter <- tweet_collection %>%
  group_by(screen_name) %>%
  mutate(line = seq_len(n())) %>%
  ungroup()

# Tweets per account, most active accounts first.
count(tweets_by_tweeter, screen_name, sort = TRUE)
glimpse(tweets_by_tweeter)
Rows: 466
Columns: 91
$ user_id <chr> "935344574", "409272386", "777159887495147525", "326586556", "847035646376132608", "...
$ status_id <chr> "1324087223030079489", "1324084896235724807", "1324082875323584512", "13240821973898...
$ created_at <dttm> 2020-11-04 20:32:48, 2020-11-04 20:23:34, 2020-11-04 20:15:32, 2020-11-04 20:12:50,...
$ screen_name <chr> "Indychick31", "amybuddy45", "MegaOmegaNes", "NickRevell317", "ckylaurie", "LancePot...
$ text <chr> "@NickRevell317 @marchmadness Feels more like Pacers/Knicks ECF", "@TowsonTigers @To...
$ source <chr> "Twitter for iPhone", "Twitter for Android", "Twitter for iPhone", "Twitter for Andr...
$ display_text_width <dbl> 33, 38, 72, 40, 66, 216, 26, 119, 230, 237, 238, 197, 66, 29, 137, 41, 69, 202, 53, ...
$ reply_to_status_id <chr> "1324082197389844480", "1323643115266256897", "1324078964911071232", "13240808189989...
$ reply_to_user_id <chr> "326586556", "28380954", "32463369", "326586556", "88001676", NA, "3002054440", NA, ...
$ reply_to_screen_name <chr> "NickRevell317", "TowsonTigers", "SouthlandSports", "NickRevell317", "notaxation", N...
$ is_quote <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, F...
$ is_retweet <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, ...
$ favorite_count <int> 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 2, 5, 0, 3, 2, 0, 0, 1, 38, 0, 0, 1, 1,...
$ retweet_count <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 1, ...
$ quote_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ reply_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ hashtags <list> [NA, NA, NA, NA, NA, <"MarchMadness", "Vote2020", "Vote", "Election2020", "Election...
$ symbols <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ urls_url <list> [NA, NA, NA, NA, NA, NA, NA, "twitter.com/marchmadness/s…", NA, NA, NA, NA, NA, NA,...
$ urls_t.co <list> [NA, NA, NA, NA, NA, NA, NA, "https://t.co/7dpNdQUJub", NA, NA, NA, NA, NA, NA, NA,...
$ urls_expanded_url <list> [NA, NA, NA, NA, NA, NA, NA, "https://twitter.com/marchmadness/status/1321453097848...
$ media_url <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "h...
$ media_t.co <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "h...
$ media_expanded_url <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "h...
$ media_type <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "p...
$ ext_media_url <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "h...
$ ext_media_t.co <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "h...
$ ext_media_expanded_url <list> [NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "h...
$ ext_media_type <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ mentions_user_id <list> [<"326586556", "202416362">, <"28380954", "320568857", "312784438", "611927090", "7...
$ mentions_screen_name <list> [<"NickRevell317", "marchmadness">, <"TowsonTigers", "Towson_MBB", "Towson_WBB", "T...
$ lang <chr> "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", "en", ...
$ quoted_status_id <chr> NA, NA, NA, NA, NA, NA, NA, "1321453097848393730", NA, NA, NA, NA, NA, NA, NA, NA, N...
$ quoted_text <chr> NA, NA, NA, NA, NA, NA, NA, "Four. More. Weeks. \U0001f64c https://t.co/20rsOT5FVL",...
$ quoted_created_at <dttm> NA, NA, NA, NA, NA, NA, NA, 2020-10-28 14:05:44, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ quoted_source <chr> NA, NA, NA, NA, NA, NA, NA, "Twitter Web App", NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ quoted_favorite_count <int> NA, NA, NA, NA, NA, NA, NA, 1244, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ quoted_retweet_count <int> NA, NA, NA, NA, NA, NA, NA, 227, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ quoted_user_id <chr> NA, NA, NA, NA, NA, NA, NA, "202416362", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ quoted_screen_name <chr> NA, NA, NA, NA, NA, NA, NA, "marchmadness", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ quoted_name <chr> NA, NA, NA, NA, NA, NA, NA, "NCAA March Madness", NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ quoted_followers_count <int> NA, NA, NA, NA, NA, NA, NA, 1420876, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ quoted_friends_count <int> NA, NA, NA, NA, NA, NA, NA, 816, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ quoted_statuses_count <int> NA, NA, NA, NA, NA, NA, NA, 29878, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ quoted_location <chr> NA, NA, NA, NA, NA, NA, NA, "", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ quoted_description <chr> NA, NA, NA, NA, NA, NA, NA, "The official NCAA March Madness destination for all thi...
$ quoted_verified <lgl> NA, NA, NA, NA, NA, NA, NA, TRUE, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ retweet_status_id <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ retweet_text <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ retweet_created_at <dttm> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
$ retweet_source <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ retweet_favorite_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ retweet_retweet_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ retweet_user_id <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ retweet_screen_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ retweet_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ retweet_followers_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ retweet_friends_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ retweet_statuses_count <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ retweet_location <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ retweet_description <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ retweet_verified <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ place_url <chr> NA, NA, "https://api.twitter.com/1.1/geo/id/ebf78e870cecf27c.json", "https://api.twi...
$ place_name <chr> NA, NA, "Nacogdoches", "Indiana", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ place_full_name <chr> NA, NA, "Nacogdoches, TX", "Indiana, USA", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
$ place_type <chr> NA, NA, "city", "admin", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "admin"...
$ country <chr> NA, NA, "United States", "United States", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
$ country_code <chr> NA, NA, "US", "US", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "US", NA, NA...
$ geo_coords <list> [<NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <N...
$ coords_coords <list> [<NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <NA, NA>, <N...
$ bbox_coords <list> [<NA, NA, NA, NA, NA, NA, NA, NA>, <NA, NA, NA, NA, NA, NA, NA, NA>, <-94.70422, -9...
$ status_url <chr> "https://twitter.com/Indychick31/status/1324087223030079489", "https://twitter.com/a...
$ name <chr> "Indychick", "Amy Pullifrone", "Leon", "Nick Revell", "Ckylaurie", "Lance Potts", "\...
$ location <chr> "", "Maryland", "Deep Deep Deep East, Tx", "Indianapolis, Indiana", "Winnipeg, Manit...
$ description <chr> "In 49 states it’s just basketball... but THIS...is INDIANA", "A wife, mother, daugh...
$ url <chr> NA, NA, NA, NA, NA, NA, NA, "https://t.co/3vvVhjYDZN", "https://t.co/f4Ba651McX", "h...
$ protected <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, ...
$ followers_count <int> 420, 259, 222, 2000, 41, 284, 147, 2694, 63, 63, 63, 63, 1, 440, 1963, 67, 319, 1158...
$ friends_count <int> 750, 2432, 419, 1765, 149, 462, 255, 81, 270, 270, 270, 270, 174, 2367, 820, 163, 52...
$ listed_count <int> 25, 12, 1, 24, 1, 28, 0, 53, 0, 0, 0, 0, 0, 18, 4, 0, 2, 64, 0, 12, 0, 0, 66, 453, 0...
$ statuses_count <int> 59228, 4484, 3123, 102132, 1667, 19894, 5021, 11340, 2309, 2309, 2309, 2309, 41, 771...
$ favourites_count <int> 54457, 2036, 10867, 314, 1745, 16379, 18108, 4472, 240, 240, 240, 240, 22, 20579, 66...
$ account_created_at <dttm> 2012-11-08 20:19:08, 2011-11-10 14:18:09, 2016-09-17 14:58:50, 2011-06-30 04:41:02,...
$ verified <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, ...
$ profile_url <chr> NA, NA, NA, NA, NA, NA, NA, "https://t.co/3vvVhjYDZN", "https://t.co/f4Ba651McX", "h...
$ profile_expanded_url <chr> NA, NA, NA, NA, NA, NA, NA, "https://pickscity.com/shop/laker_crazy-daily/", "http:/...
$ account_lang <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ profile_banner_url <chr> "https://pbs.twimg.com/profile_banners/935344574/1525299925", "https://pbs.twimg.com...
$ profile_background_url <chr> "http://abs.twimg.com/images/themes/theme1/bg.png", "http://abs.twimg.com/images/the...
$ profile_image_url <chr> "http://pbs.twimg.com/profile_images/2825347118/66387d87e20f42982aad1ee2f6fc4e84_nor...
$ line <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, ...
"Because we have kept text such as hashtags and usernames in the dataset, we can’t use a simple anti_join() to remove stop words. Instead, we can take the approach shown in the filter() line that uses str_detect() from the stringr package. – https://www.tidytextmining.com/twitter.html
# One row per token, using the tweet-aware tokenizer. Tokens are dropped when
# they are stop words (with or without their apostrophes) or contain no
# lowercase letter.
tweets_tokenized <- tweets_by_tweeter %>%
  select(text, screen_name, line) %>%
  unnest_tokens(output = word, input = text, token = "tweets") %>%
  filter(
    !word %in% c(stop_words$word, str_remove_all(stop_words$word, "'")),
    str_detect(word, "[a-z]")
  )
Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.
tweets_tokenized
head(stopwordslangs)
# Overall word counts with @mentions and rtweet's stop-word list removed.
# The join key is spelled out so the step does not depend on whichever columns
# the two tables happen to share (and no "Joining, by = ..." message is
# printed).
tweets_tokenized %>%
  count(word, sort = TRUE, name = "freq") %>%
  filter(!str_detect(word, "^\\@")) %>%
  anti_join(stopwordslangs, by = "word") # alternative: anti_join(tidytext::get_stopwords())
Joining, by = "word"
# Per-account relative word frequencies: `n` is how often an account used a
# word, `total` is that account's token count, and `freq` is n / total.
# count() on grouped data keeps the screen_name grouping, so each account's
# total is simply sum(n) within its group; this replaces a second summarise()
# plus an implicit-key left_join(). The result is ungrouped so later steps do
# not inherit the grouping by accident.
frequency <- tweets_tokenized %>%
  group_by(screen_name) %>%
  count(word, sort = TRUE) %>%
  mutate(total = sum(n), freq = n / total) %>%
  ungroup()
`summarise()` ungrouping output (override with `.groups` argument)
Joining, by = "screen_name"
frequency
"This is a nice and tidy data frame but we would actually like to plot those frequencies on the x- and y-axes of a plot, so we will need to use spread() from tidyr make a differently shaped data frame. – https://www.tidytextmining.com/twitter.html
pivot_wider
# Reshape to one row per word and one column per account holding that
# account's relative frequency. No values_fill is given, so a word an account
# never used is NA rather than 0.
frequency <- pivot_wider(
  select(frequency, screen_name, word, freq),
  names_from = screen_name,
  values_from = freq
)
frequency
# Word cloud of overall word frequencies, excluding @mentions and rtweet's
# stop-word list. The anti_join() key is explicit so the step does not rely on
# implicit matching of shared column names.
tweets_tokenized %>%
  # group_by(screen_name) %>%
  count(word, sort = TRUE, name = "freq") %>%
  filter(!str_detect(word, "^\\@")) %>%
  anti_join(stopwordslangs, by = "word") %>%
  wordcloud2()
Joining, by = "word"
# Bar chart of the 30 most frequent tokens (stop-word list not applied here),
# ignoring @mentions and ordered by frequency.
tweets_tokenized %>%
  count(word, sort = TRUE, name = "freq") %>%
  filter(!str_detect(word, "^\\@")) %>%
  slice_head(n = 30) %>%
  ggplot(mapping = aes(x = freq, y = fct_reorder(word, freq))) +
  geom_col()
# Same top-30 bar chart, but with rtweet's stop-word list removed first.
# The anti_join() key is explicit rather than inferred from shared columns.
tweets_tokenized %>%
  count(word, sort = TRUE, name = "freq") %>%
  anti_join(stopwordslangs, by = "word") %>%
  filter(!str_detect(word, "^\\@")) %>%
  slice_head(n = 30) %>%
  ggplot(aes(freq, fct_reorder(word, freq))) +
  geom_col()
Joining, by = "word"
# Compare the two accounts' word frequencies on log-scaled percentage axes.
# Words near the reference line are used about equally often by both accounts.
ggplot(data = frequency, mapping = aes(x = LouInPain, y = Dukeballnation)) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.25, height = 0.25) +
  geom_text(mapping = aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_y_log10(labels = scales::percent_format()) +
  scale_x_log10(labels = scales::percent_format()) +
  geom_abline(colour = "firebrick")
# fs::dir_create("images")
# ggsave("images/dukeball.png")
# "CBBCent1" | screen_name == "Adam_Bradford1
# marchmadness TheAndyKatz
# Compare the two accounts' word frequencies on log-scaled percentage axes.
# Words near the reference line are used about equally often by both accounts.
ggplot(data = frequency, mapping = aes(x = marchmadness, y = TheAndyKatz)) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.25, height = 0.25) +
  geom_text(mapping = aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_y_log10(labels = scales::percent_format()) +
  scale_x_log10(labels = scales::percent_format()) +
  geom_abline(colour = "firebrick")
# Earliest and latest tweet timestamps in the collected sample.
summarise(
  tweets_by_tweeter,
  min_date = min(created_at),
  max_date = max(created_at)
)
# Accounts compared in the log odds ratio: numerator first, denominator second.
ratio_accounts <- c("LouInPain", "Dukeballnation")
# Earlier candidates: "CBBCent1", "Adam_Bradford14"

# Word counts per account (one column each), keeping only non-mention words
# used at least twice across the two accounts.
word_counts <- tweets_tokenized %>%
  filter(screen_name %in% ratio_accounts) %>%
  filter(!str_detect(word, "^@")) %>%
  count(word, screen_name) %>%
  group_by(word) %>%
  filter(sum(n) >= 2) %>%
  ungroup() %>%
  pivot_wider(names_from = screen_name, values_from = n, values_fill = 0)

# If an account is missing from the sample (or has no qualifying words),
# pivot_wider() creates no column for it and the ratio below would fail with
# an opaque "object ... not found" error. Fail early with a clear message.
absent_accounts <- setdiff(ratio_accounts, names(word_counts))
if (length(absent_accounts) > 0) {
  stop(
    "No qualifying words found for account(s): ",
    paste(absent_accounts, collapse = ", "),
    ". Choose screen names that are present in `tweets_tokenized`.",
    call. = FALSE
  )
}

# Add-one smoothed share of each word within each account, then the log ratio
# of the first account's share to the second's. across() replaces the
# superseded mutate_if().
word_ratios <- word_counts %>%
  mutate(across(where(is.numeric), ~ (.x + 1) / (sum(.x) + 1))) %>%
  mutate(
    logratio = log(.data[[ratio_accounts[1]]] / .data[[ratio_accounts[2]]])
  ) %>%
  arrange(desc(logratio))
Error: Problem with `mutate()` input `logratio`.
x object 'LouInPain' not found
i Input `logratio` is `log(LouInPain/Dukeballnation)`.
Run `rlang::last_error()` to see where the error occurred.
# Words whose log ratio is closest to zero, i.e. used about equally by both
# accounts, listed first.
arrange(word_ratios, abs(logratio))
# Plot the 15 most distinctive words on each side of the log odds ratio.
# The axis label now names the accounts the ratio is actually computed from
# (LouInPain / Dukeballnation); it previously named two unrelated accounts.
# Fill legend order: FALSE (logratio >= 0) is LouInPain, TRUE is Dukeballnation.
word_ratios %>%
  group_by(logratio < 0) %>%
  top_n(15, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col() + # show.legend = FALSE) +
  coord_flip() +
  ylab("log odds ratio (LouInPain/Dukeballnation)") +
  scale_fill_discrete(name = "", labels = c("LouInPain", "Dukeballnation"))
https://www.tidytextmining.com/twitter.html#favorites-and-retweets
https://www.tidytextmining.com/twitter.html#changes-in-word-use
{r}
# Build a word/frequency table from a term-document matrix.
# NOTE(review): `TermDocumentMatrix` and `corpus` are not loaded or defined in
# the visible part of this file (looks like the tm package and a corpus built
# elsewhere) -- confirm before running.
# dtm <- DocumentTermMatrix(docs)
dtm2 <- TermDocumentMatrix(corpus)
# Dense matrix form, so the row sums below give one total per row name.
m <- as.matrix(dtm2)
# Row totals, largest first.
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
# Keep rows 2 through 200, i.e. skip the single most frequent entry.
d <- d %>%
slice(2:200)
https://www.tidytextmining.com/tfidf.html#the-bind_tf_idf-function
# Word counts per account (`tweeter`), after dropping @mentions, tokens that
# start with "http", and rtweet's stop-word list. The anti_join() key is
# explicit so the step does not depend on implicitly shared column names.
tweet_words <- tweets_by_tweeter %>%
  select(screen_name, text, status_id, user_id) %>%
  unnest_tokens(word, text, token = "tweets") %>%
  filter(!str_detect(word, "^\\@")) %>%
  filter(!str_detect(word, "^http")) %>%
  anti_join(stopwordslangs, by = "word") %>%
  count(word, tweeter = screen_name, sort = TRUE)
Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.
Joining, by = "word"
tweet_words
# Total counted words per account, largest totals first.
total_words <- tweet_words %>%
  group_by(tweeter) %>%
  summarise(total = sum(n)) %>%
  arrange(desc(total))
`summarise()` ungrouping output (override with `.groups` argument)
total_words
# Attach each account's total word count. The key is explicit so the join does
# not silently change if the two tables come to share another column name.
tweet_words <- left_join(tweet_words, total_words, by = "tweeter")
Joining, by = "tweeter"
tweet_words
# tf-idf of each word, treating every account as one document.
bind_tf_idf(tweet_words, term = word, document = tweeter, n = n)

# Plot tf-idf for words an account used more than twice, one panel per
# account. Factor levels are set in reverse tf-idf order before the flip.
tweet_words %>%
  bind_tf_idf(term = word, document = tweeter, n = n) %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  filter(n > 2) %>%
  # group_by(tweeter) %>%
  # top_n(2) %>%
  # ungroup() %>%
  ggplot(mapping = aes(x = word, y = tf_idf)) +
  geom_col() +
  facet_wrap(vars(tweeter)) +
  coord_flip()
http://antonio-ferraro.eu.pn/word-clouds-in-r-packages-wordcloud2-and-tm/
https://jrnold.github.io/qss-tidy/discovery.html#textual-data
https://rstudio-pubs-static.s3.amazonaws.com/31867_8236987cf0a8444e962ccd2aec46d9c3.html
of less use